import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import category_encoders as ce
import warnings
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import recall_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SequentialFeatureSelector
# Silence FutureWarning messages for everything that runs after this point.
warnings.simplefilter("ignore", FutureWarning)
# Read the vehicles dataset from disk, then print a summary of its columns,
# non-null counts and dtypes.
vehicles = pd.read_csv("data/vehicles.csv")
vehicles.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 426880 entries, 0 to 426879 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 426880 non-null int64 1 region 426880 non-null object 2 price 426880 non-null int64 3 year 425675 non-null float64 4 manufacturer 409234 non-null object 5 model 421603 non-null object 6 condition 252776 non-null object 7 cylinders 249202 non-null object 8 fuel 423867 non-null object 9 odometer 422480 non-null float64 10 title_status 418638 non-null object 11 transmission 424324 non-null object 12 VIN 265838 non-null object 13 drive 296313 non-null object 14 size 120519 non-null object 15 type 334022 non-null object 16 paint_color 296677 non-null object 17 state 426880 non-null object dtypes: float64(2), int64(2), object(14) memory usage: 58.6+ MB
# Let pandas pick the best dtype for each column, using the nullable
# extension dtypes that support pd.NA.
vehicles = vehicles.convert_dtypes()
# Row count at this stage (presumably kept to compare against the count
# after cleaning; confirm against later cells).
original_row_count = len(vehicles)
# CALC: percentage of missing values in each column
vehicles.isna().sum() / original_row_count * 100
id 0.000000 region 0.000000 price 0.000000 year 0.282281 manufacturer 4.133714 model 1.236179 condition 40.785232 cylinders 41.622470 fuel 0.705819 odometer 1.030735 title_status 1.930753 transmission 0.598763 VIN 37.725356 drive 30.586347 size 71.767476 type 21.752717 paint_color 30.501078 state 0.000000 dtype: float64
# Drop the features (columns) that are not relevant to the analysis.
# `columns=` already names the axis, so no separate `axis` argument is needed.
vehicles.drop(columns=["id", "region", "VIN", "state"], inplace=True)
# Heatmap of the missing-value mask, taken before any NaN rows are dropped.
px.imshow(vehicles.isna())